package com.example.book.crawler.pageprocessor;

import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.UUID;

/**
 * Page processor that extracts the chapter body (the element with id
 * {@code htmlContent}) from a chapter page and publishes it under the result
 * field {@code "content"}.
 *
 * <p>When the page's {@code linkNext} element reads "下一页" ("next page"), the
 * chapter continues on a second page. That page is fetched synchronously with a
 * nested {@link Spider} driven by {@link NextPageHandler}, and its content is
 * appended to the first page's content. Only the {@code _2} continuation page is
 * fetched; further continuation pages are not followed.
 */
public class ContentPageProcessor implements PageProcessor {

    /** File extension stripped from the chapter URL before the page suffix is appended. */
    private static final String HTML_SUFFIX = ".html";

    /** Suffix that turns a chapter URL (minus its extension) into its second-page URL. */
    private static final String SECOND_PAGE_SUFFIX = "_2.html";

    private final Site site = Site.me().setRetryTimes(5).setSleepTime(5000).setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36");

    /**
     * Extracts the chapter content from {@code page}, appends the content of the
     * continuation page when one is advertised, and stores the combined text in
     * the {@code "content"} result field (an empty string when nothing was found).
     *
     * @param page the downloaded chapter page
     */
    @Override
    public void process(Page page) {
        // StringBuffer (not StringBuilder) is kept on purpose: the nested spider's
        // pipeline callback below may append from a worker thread.
        final StringBuffer chapter = new StringBuffer();

        String content = page.getHtml().xpath("//*[@id=\"htmlContent\"]/html()").toString();
        if (StringUtils.isNotBlank(content)) {
            chapter.append(content);
        }

        String nextLabel = page.getHtml().xpath("//*[@id=\"linkNext\"]/text()").toString();
        if ("下一页".equals(nextLabel)) {
            // Only build the follow-up URL when it is actually needed.
            String nextUrl = secondPageUrl(page.getUrl().toString());

            Spider spider = new Spider(new NextPageHandler());
            spider.setUUID(UUID.randomUUID().toString());
            spider.addUrl(nextUrl);
            spider.addPipeline(new Pipeline() {
                @Override
                public void process(ResultItems resultItems, Task task) {
                    String text = resultItems.get("content");
                    if (text != null) {
                        chapter.append(text);
                    }
                }
            });
            // run() blocks until the nested crawl finishes, so the buffer is
            // complete by the time the field is published below.
            spider.thread(1).run();
            spider.clearPipeline().close();
        }
        page.putField("content", chapter.toString());
    }

    /**
     * Builds the URL of a chapter's second page.
     *
     * <p>Works on the plain string and removes only a literal trailing
     * {@code .html}. The previous code used the selector's regex-based
     * {@code replace}, in which the unescaped dot matched any character and
     * every occurrence in the URL was removed.
     *
     * @param pageUrl URL of the first chapter page
     * @return {@code pageUrl} without a trailing {@code .html}, followed by {@code _2.html}
     */
    private static String secondPageUrl(String pageUrl) {
        String base = pageUrl;
        if (base.endsWith(HTML_SUFFIX)) {
            base = base.substring(0, base.length() - HTML_SUFFIX.length());
        }
        return base + SECOND_PAGE_SUFFIX;
    }

    /**
     * @return the crawl settings (retry count, sleep time, timeout, user agent) used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Manual smoke run: crawls one hard-coded chapter URL and prints the
     * combined content to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        Spider spider = new Spider(new ContentPageProcessor());
        spider.setUUID(UUID.randomUUID().toString());
        spider.addUrl("https://www.wmtxt.net/book/55970/33124741.html");
        spider.addPipeline((resultItems, task) -> {
            String content = resultItems.get("content");
            System.out.println(">>>>" + content);
        });
        spider.thread(1);
        spider.run();
    }
}

/**
 * Page processor for a chapter's continuation page. It extracts the element
 * with id {@code htmlContent}, removes known site boilerplate (site banner,
 * book link, app-download advert, "chapter continues" notice) and publishes the
 * cleaned HTML under the result field {@code "content"}.
 */
class NextPageHandler implements PageProcessor {

    /**
     * Literal fragments removed from the extracted HTML, applied in this order.
     * They are matched verbatim (no regex).
     */
    private static final String[] BOILERPLATE_FRAGMENTS = {
            " 无名小说网 www.wmtxt.net，最快更新",
            "<a href=\"https://www.wmtxt.net/book/55970/\">美女总裁的贴身兵王</a>",
            "最新章节！",
            "    &nbs... -->>",
            "<div class=\"show-app2\" onclick=\"window.location.href='http://go2021-1955280580.bceapp.com/go123.php'\">" +
                    "<div class=\"show-app2-content\">" +
                    "<div class=\"show-app2-cover\">" +
                    "<img src=\"https://zhengxin-pub.cdn.bcebos.com/logopic/195d991373608e23a2bfa88132ad9779_fullsize.jpg\">" +
                    "</div>" +
                    "<div class=\"show-app2-detail\">" +
                    "<p>请安装我们的客户端</p>" +
                    "<p><span class=\"show-pc\">更多好书 </span>" +
                    "离线下载 无广告阅读</p>" +
                    "</div></div>" +
                    "<div class=\"show-app2-button\"><div><strong>下载APP</strong></div><div>终身免费阅读</div></div><div class=\"clear\"></div></div>",
            "<p class=\"text-danger text-center mg0\">本章未完，点击下一页继续阅读</p>"
    };

    private final Site site = Site.me().setRetryTimes(5).setSleepTime(5000).setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36");

    /**
     * Extracts and cleans the continuation page's content and stores it in the
     * {@code "content"} result field. If the page has no {@code htmlContent}
     * element, the page is marked as skipped so no pipeline receives it.
     *
     * @param page the downloaded continuation page
     */
    @Override
    public void process(Page page) {
        String content = page.getHtml().xpath("//*[@id=\"htmlContent\"]/html()").toString();
        if (content == null) {
            // Nothing matched the selector: skip instead of failing with a
            // NullPointerException on the replace calls below.
            page.setSkip(true);
            return;
        }

        // String.replace returns a new string; the result must be reassigned,
        // otherwise the boilerplate is never actually removed.
        for (String fragment : BOILERPLATE_FRAGMENTS) {
            content = content.replace(fragment, "");
        }
        page.putField("content", content);
    }

    /**
     * @return the crawl settings (retry count, sleep time, timeout, user agent) used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }
}
